#import libraries
import pandas as pd
import numpy as np
import plotly.express as px
import missingno as msno
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Load the full Netflix catalogue, then keep only the rows whose `type` is "Movie"
# in a separate DataFrame.
df = pd.read_csv("C:/Users/rafae/OneDrive/Escritorio/Badededatosexcel/netflix_titles.csv")
is_movie = df["type"] == "Movie"
movies = pd.DataFrame(df[is_movie])
# Preview the first 3 rows of the movie subset.
movies.head(3)
| | show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 6 | s7 | Movie | My Little Pony: A New Generation | Robert Cullen, José Luis Ucha | Vanessa Hudgens, Kimiko Glenn, James Marsden, ... | NaN | September 24, 2021 | 2021 | PG | 91 min | Children & Family Movies | Equestria's divided. But a bright-eyed hero be... |
| 7 | s8 | Movie | Sankofa | Haile Gerima | Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D... | United States, Ghana, Burkina Faso, United Kin... | September 24, 2021 | 1993 | TV-MA | 125 min | Dramas, Independent Movies, International Movies | On a photo shoot in Ghana, an American model s... |
# Visualize missing data per column of the full dataset with missingno,
# using matplotlib for the title and for rendering.
msno.bar(df, fontsize=10, sort='descending', figsize=(12, 6))
# Title typo fixed: it previously read "MISSING VAUES".
plt.title("MISSING VALUES", fontsize=25)
plt.show()
# Count the missing values per column of `movies`, keeping only the columns
# that contain at least one missing value.
null_mask = movies.isnull()
null_mask.loc[:, null_mask.any()].sum()
director 188 cast 475 country 440 rating 2 duration 3 dtype: int64
# Fill the missing values:
# - director / cast are not essential to the analysis, so they get the
#   placeholder "NoAvailable";
# - country is assumed to be "United States" (Netflix being a US company,
#   most movies are expected to come from there);
# - rating / duration take the most frequent value of their own column.
movies["director"] = movies["director"].fillna("NoAvailable")
movies["cast"] = movies["cast"].fillna("NoAvailable")
movies["country"] = movies["country"].fillna("United States")
# Series.mode() returns a Series indexed 0..k-1, and fillna() with a Series
# aligns on the index, so passing it directly only fills rows whose label
# happens to match. Use the first mode as a scalar instead.
for col in ("rating", "duration"):
    col_modes = movies[col].mode()
    if not col_modes.empty:  # an all-missing column has no mode to fill with
        movies[col] = movies[col].fillna(col_modes.iloc[0])
# Parse `date_added` into datetimes, then derive the month number, the month
# name and the year as new columns through the `.dt` accessor.
movies["date_added"] = pd.to_datetime(movies["date_added"])
added = movies["date_added"].dt
movies["month_added"] = added.month
movies["month_name_added"] = added.month_name()
movies["year_added"] = added.year
# Split the duration column on its first space into the number
# ("duration_min") and the unit ("min"). n=1 caps the result at two columns
# so the two-column assignment cannot fail on a value with extra spaces.
movies[["duration_min", "min"]] = movies["duration"].str.split(" ", n=1, expand=True)
# Fill any remaining gaps with the most frequent duration. The fill value
# must come from this column (not from `rating`) and must be a scalar:
# fillna() with the Series returned by mode() would align on the index.
duration_modes = movies["duration_min"].mode()
if not duration_modes.empty:
    movies["duration_min"] = movies["duration_min"].fillna(duration_modes.iloc[0])
# Some columns hold several comma-separated values per row, so each of them
# gets its own Series with one entry per individual value, indexed by title.
def _split_by_title(column):
    """Return the ", "-separated items of `column`, one per row, indexed by title."""
    wide = movies.set_index("title")[column].str.split(", ", expand=True)
    # stack() turns the per-position columns into rows; the position level of
    # the resulting index is dropped so only the title remains.
    return wide.stack().reset_index(level=1, drop=True)


movies_listed_in = _split_by_title("listed_in")
movies_cast = _split_by_title("cast")
movies_director = _split_by_title("director")
movies_country = _split_by_title("country")
# Two views of `release_year`: a histogram first, then a box plot.
df = movies
fig = px.histogram(
    df,
    x="release_year",
    color="release_year",
    color_discrete_sequence=["aquamarine"],
    title="Histogram of movies added by year",
)
fig.show()

fig = px.box(
    df,
    x="release_year",
    color_discrete_sequence=["blueviolet"],
    title="Distribution of the years",
)
fig.show()
# Number of movies added per month name, most frequent month first.
month = movies["month_name_added"].value_counts()
# Name both columns explicitly: the names value_counts() gives its result
# differ between pandas versions, so the plot must not depend on them.
month_counts = month.rename_axis("months").reset_index(name="counts")
fig = px.bar(
    month_counts,
    x="months",
    y="counts",
    color="months",
    title="Added in months",
)
fig.show()
# Histogram of movie durations in minutes.
# `duration_min` holds strings, so convert to numbers to get an ordered axis;
# values that cannot be parsed are dropped.
minutes = pd.to_numeric(movies["duration_min"], errors="coerce").dropna()
# One row per distinct duration with explicit column names, so the plot does
# not depend on the pandas-version-specific names from reset_index().
df1 = (
    minutes.value_counts()
    .sort_index()
    .rename_axis("Duration")
    .reset_index(name="count")
)
fig = px.histogram(
    df1,
    x="Duration",
    y="count",  # summed per bin, i.e. the number of movies in that bin
    color_discrete_sequence=["darkblue"],
    title="Duration",
)
fig.update_layout(yaxis_title="count")
fig.show()
# Violin plot of the movie durations in minutes.
# Plot the durations themselves: feeding a value_counts() table here would
# make the violin describe how often each duration occurs instead.
df = pd.DataFrame(
    {"duration_min": pd.to_numeric(movies["duration_min"], errors="coerce").dropna()}
)
fig = px.violin(
    df,
    y="duration_min",
    box=True,  # draw box plot inside the violin
    points='all',  # can be 'outliers', or False
    labels={"duration_min": "Duration"},
    title="Graphs of duration values in which netflix movies are distributed",
)
fig.show()
# Share of movies per producing country.
# Column names are set explicitly because the names produced by
# value_counts().reset_index() differ between pandas versions.
df = movies_country.value_counts().rename_axis("country").reset_index(name="counts")
fig = px.pie(df, values="counts", names="country", title='Population country')
fig.update_traces(textposition='inside')
# Hide slice labels that would need a font smaller than 12 to fit.
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()
# Top 20 cast members by number of movies.
cast_counts = movies_cast.value_counts()
# Drop the "NoAvailable" placeholder by label rather than by skipping the
# first row, so a real name is never discarded if the placeholder is not on top.
cast_counts = cast_counts[cast_counts.index != "NoAvailable"]
# Explicit column names keep this independent of the pandas version.
df = cast_counts.head(20).rename_axis("cast").reset_index(name="counts")
fig = px.funnel(df, x='cast', y='counts', color_discrete_sequence=["chocolate"], title="top 20 actors")
fig.show()
# Number of movies per category (`listed_in` item), most frequent first.
# Explicit column names keep this independent of the pandas version, whose
# default names for value_counts().reset_index() have changed over time.
df = movies_listed_in.value_counts().rename_axis("Categories").reset_index(name="counts")
fig = px.bar(df, x="Categories", y="counts", color_discrete_sequence=["teal"], title="Movies categories")
fig.show()
# Distribution of the number of movies directed per director.
director_counts = movies_director.value_counts()
# Exclude the "NoAvailable" placeholder by label instead of assuming it is
# the first (most frequent) entry.
director_counts = director_counts[director_counts.index != "NoAvailable"]
# Explicit column names keep this independent of the pandas version.
df = director_counts.rename_axis("Directors").reset_index(name="counts")
fig = px.box(df, x="counts", title="Number of films directed")
fig.show()
# The 10 directors with the most movies, as a donut chart.
director_counts = movies_director.value_counts()
# Exclude the "NoAvailable" placeholder by label instead of assuming it is
# the first (most frequent) entry.
director_counts = director_counts[director_counts.index != "NoAvailable"]
# Explicit column names keep this independent of the pandas version.
df = director_counts.head(10).rename_axis("Directors").reset_index(name="counts")
fig = px.pie(df, values="counts", names='Directors', title='10 directors', hole=.35)
fig.show()
# Scatter matrix over release year, month added, duration and rating.
pair_columns = ["release_year", "month_name_added", "duration", "rating"]
df = movies.loc[:, pair_columns]
fig = px.scatter_matrix(df)
fig.show()
# Density heatmap of rating against release year, with a marginal histogram
# on each axis and the cell values printed on the map.
df = movies
fig = px.density_heatmap(
    df,
    x="release_year",
    y="rating",
    marginal_x="histogram",
    marginal_y="histogram",
    text_auto=True,
)
fig.show()
# Animated world map: number of movies added per country for each year.
# `country` can list several countries in one string ("A, B, C"), which
# cannot be matched as a country name on the map, so give every listed
# country its own row before counting.
per_country = movies[["year_added", "country"]].dropna()
per_country = per_country.assign(
    country=per_country["country"].str.split(", ")
).explode("country")
# Remove stray spaces/commas left over from the split and drop empty names.
per_country["country"] = per_country["country"].str.strip(" ,")
per_country = per_country[per_country["country"] != ""]
# groupby sorts its keys, so the rows come out ordered by year_added.
year_country2 = (
    per_country.groupby(["year_added", "country"])
    .size()
    .reset_index(name="counts")
)
fig = px.choropleth(
    year_country2,
    locations="country",
    color="counts",
    locationmode='country names',
    animation_frame='year_added',
    range_color=[0, 200],
    color_continuous_scale=px.colors.sequential.OrRd,
)
fig.update_layout(title='Comparison by country')
fig.show()
# Movies added per year, broken down by rating: one chart for kids, one for
# teens, one for adults, and a final chart with every rating together.
column = "year_added"


def _yearly_counts(rating):
    """Count the movies with the given `rating` per value of `column`.

    Returns a DataFrame with the columns `column` ("year_added") and
    "count", sorted by `column`. Both names are set explicitly because the
    names produced by value_counts().reset_index() differ between pandas
    versions.
    """
    years = movies.loc[movies["rating"] == rating, column]
    return (
        years.value_counts()
        .rename_axis(column)
        .reset_index(name="count")
        .sort_values(column)
    )


def _rating_trace(rating, color, name=None):
    """Build the scatter trace of yearly counts for `rating`.

    `name` is the legend entry and defaults to the rating itself.
    """
    counts = _yearly_counts(rating)
    return go.Scatter(
        x=counts[column],
        y=counts["count"],
        name=rating if name is None else name,
        marker=dict(color=color),
    )


def _show_rating_trends(title, traces):
    """Show `traces` in one figure with a horizontal legend; return the figure."""
    layout = go.Layout(title=title, legend=dict(x=0.3, y=1.2, orientation="h"))
    figure = go.Figure(traces, layout=layout)
    figure.show()
    return figure


# children
fig = _show_rating_trends(
    "Kids movies added from 2015-2021",
    [_rating_trace("TV-Y7", "#EB2B11", name="Kids")],
)

# teens
fig = _show_rating_trends(
    "Teens movies added from 2011-2021",
    [
        _rating_trace("TV-14", "#EB2B11"),
        _rating_trace("PG-13", "#291D1B"),
    ],
)

# adults
fig = _show_rating_trends(
    "Adults movies added from 2011-2021",
    [
        _rating_trace("UR", "#E75D0E"),
        _rating_trace("NR", "#D3E70E"),
        _rating_trace("NC-17", "#0EE4E7"),
        _rating_trace("R", "#0E6AE7"),
        _rating_trace("TV-MA", "#E70E77"),
    ],
)

# all ratings together: adult ratings in grey, teen ratings dark, kids in red
fig = _show_rating_trends(
    "movies added ",
    [
        _rating_trace("UR", "#C7C7C7"),
        _rating_trace("NR", "#C7C7C7"),
        _rating_trace("NC-17", "#C7C7C7"),
        _rating_trace("R", "#C7C7C7"),
        _rating_trace("TV-MA", "#C7C7C7"),
        _rating_trace("TV-14", "#291D1B"),
        _rating_trace("PG-13", "#291D1B"),
        _rating_trace("TV-Y7", "#EB2B11", name="Kids"),
    ],
)